library(readr)
library(tibble)
library(tidyr)

Attaching package: ‘tidyr’

The following object is masked from ‘package:Matrix’:

    expand
library(dplyr)

Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union
library(tidytext)
library(ggplot2)
library(tm)
Loading required package: NLP

Attaching package: ‘NLP’

The following object is masked from ‘package:ggplot2’:

    annotate
library(topicmodels)
library(stringr)
weeknotes_ryan <- as_tibble(read_csv('weeknotes_ryan.csv', col_names = FALSE))
Parsed with column specification:
cols(
  X1 = col_character(),
  X2 = col_character(),
  X3 = col_character(),
  X4 = col_character(),
  X5 = col_character(),
  X6 = col_character(),
  X7 = col_character(),
  X8 = col_character(),
  X9 = col_character(),
  X10 = col_character()
)
names(weeknotes_ryan) <- c( "s02e01",
                            "s01e09",
                            "s01e08",
                            "s01e07",
                            "s01e06",
                            "s01e05",
                            "s01e04",
                            "s01e03",
                            "s01e02",      
                            "s01e01"
       )
weeknotes_ryan %>% 
  gather("episode", "text", 1:10) -> tidy_notes
tidy_notes
tidy_notes %>% 
  mutate(text2 = gsub("[.]", " ", text)) -> tidier_notes
tidier_notes %>% 
  unnest_tokens(word, text2) -> tidy_tokens
tidy_tokens
data("stop_words")
other_stop_words <- c("i’d", "i’m", "i’ve")
tidy_tokens %>% 
  anti_join(stop_words) %>% 
  filter(!(word %in% other_stop_words)) -> tidier_wo_stopwords
Joining, by = "word"
tidier_wo_stopwords %>% 
  filter(str_detect(word, regex("i’d", ignore_case = TRUE)))
tidier_wo_stopwords %>% 
  filter(episode == 's01e01') %>% 
  count(word, sort=TRUE)
tidier_wo_stopwords %>% 
  filter(episode == 's02e01') %>% 
  count(word, sort=TRUE)
library(ggplot2)
tidier_wo_stopwords %>%
  count(word, sort = TRUE) %>%
  filter(n > 30) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col() +
  xlab(NULL) +
  coord_flip()

tidier_wo_stopwords
tidier_wo_stopwords %>%
  count(episode, word, sort = TRUE) %>%
  ungroup() -> kk
kk %>% 
  bind_tf_idf(word, episode, n) -> tidier_tf_idf 
tidier_tf_idf %>% 
  arrange(desc(tf_idf))
tidier_tf_idf %>%
  arrange(desc(tf_idf)) %>%
  mutate(word = factor(word, levels = rev(unique(word)))) -> plot_episodes_tf_idf
plot_episodes_tf_idf %>% 
  group_by(episode) %>% 
  top_n(7) %>% 
  ungroup %>%
  ggplot(aes(word, tf_idf, fill = episode)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~episode, ncol = 2, scales = "free") +
  coord_flip()
Selecting by tf_idf

tidier_notes %>% 
  unnest_tokens(bigram, text, token = "ngrams", n = 2) -> bigrams
bigrams %>%
  count(bigram, sort = TRUE)
bigrams_separated <- bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ")
bigrams_filtered <- bigrams_separated %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word) %>% 
  filter(!word2 %in% other_stop_words) %>% 
  filter(!word2 %in% other_stop_words)
  
# new bigram counts:
bigram_counts <- bigrams_filtered %>% 
  count(word1, word2, sort = TRUE)
bigram_counts
tidier_notes %>% 
  unnest_tokens(trigram, text, token = "ngrams", n = 3) -> trigrams
trigrams_separated <- trigrams %>%
  separate(trigram, c("word1", "word2", "word3"), sep = " ")
trigrams_filtered <- trigrams_separated %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word3 %in% stop_words$word) %>% 
  filter(!word1 %in% other_stop_words) %>% 
  filter(!word3 %in% other_stop_words)
  
# new bigram counts:
trigram_counts <- trigrams_filtered %>% 
  count(word1, word2, word3, sort = TRUE)
trigram_counts
tidier_tf_idf %>% 
  cast_dtm(document = episode,term = word, value = n) -> dtm_matrix
dtm_matrix
<<DocumentTermMatrix (documents: 10, terms: 2204)>>
Non-/sparse entries: 4623/17417
Sparsity           : 79%
Maximal term length: 19
Weighting          : term frequency (tf)
tidier_tf_idf %>% 
  cast_sparse(episode, word, n) -> sparse_matrix
str(sparse_matrix)
Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
  ..@ i       : int [1:4623] 0 1 2 3 4 5 6 7 8 9 ...
  ..@ p       : int [1:2205] 0 10 20 24 34 42 52 62 72 82 ...
  ..@ Dim     : int [1:2] 10 2204
  ..@ Dimnames:List of 2
  .. ..$ : chr [1:10] "s01e05" "s01e09" "s01e08" "s01e01" ...
  .. ..$ : chr [1:2204] "data" "call" "event" "meeting" ...
  ..@ x       : num [1:4623] 31 26 25 23 20 20 17 17 14 11 ...
  ..@ factors : list()
txt_lda <- LDA(dtm_matrix, k = 2, control = list(seed = 1234))
txt_topics <- tidy(txt_lda, matrix = "beta")
top_terms <- txt_topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)
top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()

LS0tCnRpdGxlOiAiV2Vla2x5IG5vdGVzIHRleHQgYW5hbHlzaXMiCm91dHB1dDogaHRtbF9ub3RlYm9vawotLS0KCmBgYHtyfQpsaWJyYXJ5KHJlYWRyKQpsaWJyYXJ5KHRpYmJsZSkKbGlicmFyeSh0aWR5cikKbGlicmFyeShkcGx5cikKbGlicmFyeSh0aWR5dGV4dCkKbGlicmFyeShnZ3Bsb3QyKQpsaWJyYXJ5KHRtKQpsaWJyYXJ5KHRvcGljbW9kZWxzKQpsaWJyYXJ5KHN0cmluZ3IpCmBgYAoKYGBge3J9CndlZWtub3Rlc19yeWFuIDwtIGFzX3RpYmJsZShyZWFkX2Nzdignd2Vla25vdGVzX3J5YW4uY3N2JywgY29sX25hbWVzID0gRkFMU0UpKQpuYW1lcyh3ZWVrbm90ZXNfcnlhbikgPC0gYyggInMwMmUwMSIsCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAiczAxZTA5IiwKICAgICAgICAgICAgICAgICAgICAgICAgICAgICJzMDFlMDgiLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgInMwMWUwNyIsCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAiczAxZTA2IiwKICAgICAgICAgICAgICAgICAgICAgICAgICAgICJzMDFlMDUiLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgInMwMWUwNCIsCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAiczAxZTAzIiwKICAgICAgICAgICAgICAgICAgICAgICAgICAgICJzMDFlMDIiLCAgICAgIAogICAgICAgICAgICAgICAgICAgICAgICAgICAgInMwMWUwMSIKICAgICAgICkKYGBgCgoKCmBgYHtyfQp3ZWVrbm90ZXNfcnlhbiAlPiUgCiAgZ2F0aGVyKCJlcGlzb2RlIiwgInRleHQiLCAxOjEwKSAtPiB0aWR5X25vdGVzCmBgYAoKYGBge3J9CnRpZHlfbm90ZXMKYGBgCgpgYGB7cn0KdGlkeV9ub3RlcyAlPiUgCiAgbXV0YXRlKHRleHQyID0gZ3N1YigiWy5dIiwgIiAiLCB0ZXh0KSkgLT4gdGlkaWVyX25vdGVzCmBgYAoKYGBge3J9CnRpZGllcl9ub3RlcyAlPiUgCiAgdW5uZXN0X3Rva2Vucyh3b3JkLCB0ZXh0MikgLT4gdGlkeV90b2tlbnMKYGBgCgpgYGB7cn0KdGlkeV90b2tlbnMKYGBgCgoKCmBgYHtyfQpkYXRhKCJzdG9wX3dvcmRzIikKb3RoZXJfc3RvcF93b3JkcyA8LSBjKCJp4oCZZCIsICJp4oCZbSIsICJp4oCZdmUiKQpgYGAKCmBgYHtyfQp0aWR5X3Rva2VucyAlPiUgCiAgYW50aV9qb2luKHN0b3Bfd29yZHMpICU+JSAKICBmaWx0ZXIoISh3b3JkICVpbiUgb3RoZXJfc3RvcF93b3JkcykpIC0+IHRpZGllcl93b19zdG9wd29yZHMKYGBgCgoKYGBge3J9CnRpZGllcl93b19zdG9wd29yZHMgJT4lIAogIGZpbHRlcihzdHJfZGV0ZWN0KHdvcmQsIHJlZ2V4KCJp4oCZZCIsIGlnbm9yZV9jYXNlID0gVFJVRSkpKQpgYGAKCmBgYHtyfQp0aWRpZXJfd29fc3RvcHdvcmRzICU+JSAKICBmaWx0ZXIoZXBpc29kZSA9PSAnczAxZTAxJykgJT4lIAogIGNvdW50KHdvcmQsIHNvcnQ9VFJVRSkKYGBgCgpgYGB7cn0KdGlkaWVyX3dvX3N0b3B3b3JkcyAlPiUgCiAgZmlsdGVyKGVwaXNvZGUgPT0gJ3MwMmUwMScpICU+JSAKICBjb3VudCh3b3JkLCBzb3J0PVRSVUUpCmBgYAoKYGBge3J9CmxpYnJhcnkoZ2dwbG90MikKCnRpZGllcl93b19zdG9wd29yZHMgJT4lCiAgY291bnQod29yZCwgc29ydCA9IFRSVUUpICU+JQogIGZpbHRlcihuID4gMzApICU+JQogIG11dGF0ZSh3b3JkID0gcmVvcmRlcih3b3JkLCBuKSkgJT4lCiAgZ2dwbG90KGFlcyh3b3JkLCBuKSkgKwogIGdlb21fY29sKCkgKwogIHhsYWIoTlVMTCkgKwogIGNvb3JkX2ZsaXAoKQpgYGAKCmBgYHtyfQp0aWRpZXJfd29fc3RvcHdvcmRzCmBgYAoKYGBge3J9CnRpZGllcl93b19zdG9wd29yZHMgJT4lCiAgY291bnQoZXBpc29kZSwgd29yZCwgc29ydCA9IFRSVUUpICU+JQogIHVuZ3JvdXAoKSAtPiBrawoKa2sgJT4lIAogIGJpbmRfdGZfaWRmKHdvcmQsIGVwaXNvZGUsIG4pIC0+IHRpZGllcl90Zl9pZGYgCmBgYAoKYGBge3J9CnRpZGllcl90Zl9pZGYgJT4lIAogIGFycmFuZ2UoZGVzYyh0Zl9pZGYpKQpgYGAKCmBgYHtyfQp0aWRpZXJfdGZfaWRmICU+JQogIGFycmFuZ2UoZGVzYyh0Zl9pZGYpKSAlPiUKICBtdXRhdGUod29yZCA9IGZhY3Rvcih3b3JkLCBsZXZlbHMgPSByZXYodW5pcXVlKHdvcmQpKSkpIC0+IHBsb3RfZXBpc29kZXNfdGZfaWRmCmBgYAoKYGBge3J9CnBsb3RfZXBpc29kZXNfdGZfaWRmICU+JSAKICBncm91cF9ieShlcGlzb2RlKSAlPiUgCiAgdG9wX24oNykgJT4lIAogIHVuZ3JvdXAgJT4lCiAgZ2dwbG90KGFlcyh3b3JkLCB0Zl9pZGYsIGZpbGwgPSBlcGlzb2RlKSkgKwogIGdlb21fY29sKHNob3cubGVnZW5kID0gRkFMU0UpICsKICBsYWJzKHggPSBOVUxMLCB5ID0gInRmLWlkZiIpICsKICBmYWNldF93cmFwKH5lcGlzb2RlLCBuY29sID0gMiwgc2NhbGVzID0gImZyZWUiKSArCiAgY29vcmRfZmxpcCgpCmBgYAoKYGBge3J9CnRpZGllcl9ub3RlcyAlPiUgCiAgdW5uZXN0X3Rva2VucyhiaWdyYW0sIHRleHQsIHRva2VuID0gIm5ncmFtcyIsIG4gPSAyKSAtPiBiaWdyYW1zCmBgYAoKYGBge3J9CmJpZ3JhbXMgJT4lCiAgY291bnQoYmlncmFtLCBzb3J0ID0gVFJVRSkKYGBgCgpgYGB7cn0KYmlncmFtc19zZXBhcmF0ZWQgPC0gYmlncmFtcyAlPiUKICBzZXBhcmF0ZShiaWdyYW0sIGMoIndvcmQxIiwgIndvcmQyIiksIHNlcCA9ICIgIikKCmJpZ3JhbXNfZmlsdGVyZWQgPC0gYmlncmFtc19zZXBhcmF0ZWQgJT4lCiAgZmlsdGVyKCF3b3JkMSAlaW4lIHN0b3Bfd29yZHMkd29yZCkgJT4lCiAgZmlsdGVyKCF3b3JkMiAlaW4lIHN0b3Bfd29yZHMkd29yZCkgJT4lIAogIGZpbHRlcighd29yZDIgJWluJSBvdGhlcl9zdG9wX3dvcmRzKSAlPiUgCiAgZmlsdGVyKCF3b3JkMiAlaW4lIG90aGVyX3N0b3Bfd29yZHMpCiAgCgojIG5ldyBiaWdyYW0gY291bnRzOgpiaWdyYW1fY291bnRzIDwtIGJpZ3JhbXNfZmlsdGVyZWQgJT4lIAogIGNvdW50KHdvcmQxLCB3b3JkMiwgc29ydCA9IFRSVUUpCgpiaWdyYW1fY291bnRzCmBgYAoKYGBge3J9CnRpZGllcl9ub3RlcyAlPiUgCiAgdW5uZXN0X3Rva2Vucyh0cmlncmFtLCB0ZXh0LCB0b2tlbiA9ICJuZ3JhbXMiLCBuID0gMykgLT4gdHJpZ3JhbXMKCnRyaWdyYW1zX3NlcGFyYXRlZCA8LSB0cmlncmFtcyAlPiUKICBzZXBhcmF0ZSh0cmlncmFtLCBjKCJ3b3JkMSIsICJ3b3JkMiIsICJ3b3JkMyIpLCBzZXAgPSAiICIpCgp0cmlncmFtc19maWx0ZXJlZCA8LSB0cmlncmFtc19zZXBhcmF0ZWQgJT4lCiAgZmlsdGVyKCF3b3JkMSAlaW4lIHN0b3Bfd29yZHMkd29yZCkgJT4lCiAgZmlsdGVyKCF3b3JkMyAlaW4lIHN0b3Bfd29yZHMkd29yZCkgJT4lIAogIGZpbHRlcighd29yZDEgJWluJSBvdGhlcl9zdG9wX3dvcmRzKSAlPiUgCiAgZmlsdGVyKCF3b3JkMyAlaW4lIG90aGVyX3N0b3Bfd29yZHMpCiAgCgojIG5ldyBiaWdyYW0gY291bnRzOgp0cmlncmFtX2NvdW50cyA8LSB0cmlncmFtc19maWx0ZXJlZCAlPiUgCiAgY291bnQod29yZDEsIHdvcmQyLCB3b3JkMywgc29ydCA9IFRSVUUpCgp0cmlncmFtX2NvdW50cwpgYGAKCmBgYHtyfQp0aWRpZXJfdGZfaWRmICU+JSAKICBjYXN0X2R0bShkb2N1bWVudCA9IGVwaXNvZGUsdGVybSA9IHdvcmQsIHZhbHVlID0gbikgLT4gZHRtX21hdHJpeApgYGAKCmBgYHtyfQpkdG1fbWF0cml4CmBgYAoKCmBgYHtyfQp0aWRpZXJfdGZfaWRmICU+JSAKICBjYXN0X3NwYXJzZShlcGlzb2RlLCB3b3JkLCBuKSAtPiBzcGFyc2VfbWF0cml4CmBgYAoKYGBge3J9CnN0cihzcGFyc2VfbWF0cml4KQpgYGAKCmBgYHtyfQp0eHRfbGRhIDwtIExEQShkdG1fbWF0cml4LCBrID0gMiwgY29udHJvbCA9IGxpc3Qoc2VlZCA9IDEyMzQpKQpgYGAKCgpgYGB7cn0KdHh0X3RvcGljcyA8LSB0aWR5KHR4dF9sZGEsIG1hdHJpeCA9ICJiZXRhIikKdG9wX3Rlcm1zIDwtIHR4dF90b3BpY3MgJT4lCiAgZ3JvdXBfYnkodG9waWMpICU+JQogIHRvcF9uKDEwLCBiZXRhKSAlPiUKICB1bmdyb3VwKCkgJT4lCiAgYXJyYW5nZSh0b3BpYywgLWJldGEpCgp0b3BfdGVybXMgJT4lCiAgbXV0YXRlKHRlcm0gPSByZW9yZGVyKHRlcm0sIGJldGEpKSAlPiUKICBnZ3Bsb3QoYWVzKHRlcm0sIGJldGEsIGZpbGwgPSBmYWN0b3IodG9waWMpKSkgKwogIGdlb21fY29sKHNob3cubGVnZW5kID0gRkFMU0UpICsKICBmYWNldF93cmFwKH4gdG9waWMsIHNjYWxlcyA9ICJmcmVlIikgKwogIGNvb3JkX2ZsaXAoKSArCiAgdGhlbWVfYncoKQpgYGAKCg==